library("dplyr")
library("phyloseq")
library("SNFtool")
library("pheatmap")
library("igraph")
library("tibble")
library("RCy3")
library("igraph")
library(microbiome)
library(arcdiagram)
library(MetaNet)
library(PLNmodels)
library(ggplot2)
library(future)
library(factoextra)
library(tidyr)
plan(multisession, workers = 15)merged_metagenomes <- phyloseq::import_biom("../data/fastq/kraken/bracken//merge_species.biom")
meta <- readxl::read_excel("../data/Glossina_metadata.xlsx")
# Set the new column names in the phyloseq object
sample_names(merged_metagenomes) <- gsub("_.*$|\\.kraken_report_bracken_genuses$", "", sample_names(merged_metagenomes))
sample_names(merged_metagenomes) <- gsub("^Gl", "GI", sample_names(merged_metagenomes))
# Sort the 'meta' data frame by the 'SRA.identifier' column
meta$Sample <- gsub("_.*$", "", meta$Sample)
meta$Samples <- meta$Sample
meta <- meta %>% column_to_rownames(var = "Samples")
# meta$Sample == sample_names(merged_metagenomes)
# Associate the sorted metadata to the phyloseq object as sample data
merged_metagenomes@sam_data <- sample_data(meta)
# Remove the unnecessary 'k_' prefix in the taxonomy data
merged_metagenomes@tax_table@.Data <- substring(merged_metagenomes@tax_table@.Data, 4)
# Rename the columns of the taxonomy table to represent taxonomic ranks
colnames(merged_metagenomes@tax_table@.Data) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
#Keep only the kingdom of interest
merged_metagenomes <- subset_taxa(merged_metagenomes, Kingdom =="Bacteria")
# head(sample_data(merged_metagenomes))
# head(tax_table(merged_metagenomes))
# Define the list of samples to remove
samples_to_remove <- c("GI-103", "GI-59", "GI-104", "GI-111", "GI-121",
"GI-124", "GI-125", "GI-14", "GI-15", "GI-16",
"GI-17", "GI-25", "GI-26", "GI-27", "GI-28",
"GI-29", "GI-30", "GI-31", "GI-32", "GI-33",
"GI-34", "GI-36", "GI-38", "GI-39", "GI-45", "GI-47", "GI-51", "I9", "I7")
# Prune these samples from your phyloseq object
merged_metagenomes <- prune_samples(!(sample_names(merged_metagenomes) %in% samples_to_remove), merged_metagenomes)
physeq <- microbiome::aggregate_rare(merged_metagenomes, level = "Genus", detection = 0.01/100, prevalence = 5/100)# Identify OTUs with zero counts
empty_otus <- taxa_names(physeq)[taxa_sums(physeq) == 0]
# Print the empty OTU names
if (length(empty_otus) > 0) {
print(paste("Empty OTUs:", paste(empty_otus, collapse = ", ")))
} else {
print("No empty OTUs.")
}## [1] "No empty OTUs."
# Identify samples with zero counts
empty_samples <- sample_names(physeq)[sample_sums(physeq) == 0]
# Print the empty sample names
if (length(empty_samples) > 0) {
print(paste("Empty samples:", paste(empty_samples, collapse = ", ")))
} else {
print("No empty samples.")
}## [1] "No empty samples."
# Subset phyloseq object for female samples
physeq_F <- subset_samples(physeq, sex == "F")
samples_to_remove <- c("GI-45", "GI-51")
physeq_F <- prune_samples(!(sample_names(physeq_F) %in% samples_to_remove), physeq_F)
# Remove samples with no counts
physeq_F <- prune_samples(sample_sums(physeq_F) > 0, physeq_F)
# Remove OTUs with no counts
physeq_F <- prune_taxa(taxa_sums(physeq_F) > 0, physeq_F)
data_F <- prepare_data(counts = physeq_F@otu_table, covariates = physeq_F@sam_data, offset = "Wrench")
# Subset phyloseq object for female samples
physeq_M <- subset_samples(physeq, sex == "M")
physeq_M <- prune_samples(!(sample_names(physeq_M) %in% samples_to_remove), physeq_M)
data_M <- prepare_data(counts = physeq_M@otu_table, covariates = physeq_M@sam_data, offset = "Wrench")# Identify OTUs with zero counts
empty_otus <- taxa_names(physeq_F)[taxa_sums(physeq_F) == 0]
# Print the empty OTU names
if (length(empty_otus) > 0) {
print(paste("Empty OTUs:", paste(empty_otus, collapse = ", ")))
} else {
print("physeq_F: No empty OTUs.")
}## [1] "physeq_F: No empty OTUs."
# Identify samples with zero counts
empty_samples <- sample_names(physeq_F)[sample_sums(physeq_F) == 0]
# Print the empty sample names
if (length(empty_samples) > 0) {
print(paste("Empty samples:", paste(empty_samples, collapse = ", ")))
} else {
print("physeq_F: No empty samples.")
}## [1] "physeq_F: No empty samples."
# # Remove samples with no counts
# physeq_F <- prune_samples(sample_sums(physeq_F) > 0, physeq)
# # Remove OTUs with no counts
# physeq_F <- prune_taxa(taxa_sums(physeq_F) > 0, physeq)
# Identify OTUs with zero counts
empty_otus <- taxa_names(physeq_F)[taxa_sums(physeq_M) == 0]
# Print the empty OTU names
if (length(empty_otus) > 0) {
print(paste("Empty OTUs:", paste(empty_otus, collapse = ", ")))
} else {
print("physeq_M: No empty OTUs.")
}## [1] "physeq_M: No empty OTUs."
# Identify samples with zero counts
empty_samples <- sample_names(physeq_F)[sample_sums(physeq_M) == 0]
# Print the empty sample names
if (length(empty_samples) > 0) {
print(paste("Empty samples:", paste(empty_samples, collapse = ", ")))
} else {
print("physeq_M: No empty samples.")
}## [1] "physeq_M: No empty samples."
We have a clear Poisson distribution.
hist(as.matrix(as.data.frame(physeq@otu_table), nclass = 10, main = "Abundance of the OTU data", xlab = "values"))The Poisson lognormal model is a statistical model used to describe count data that exhibits overdispersion, which means the variance is greater than the mean. This model combines two components: a Poisson distribution and a lognormal distribution.
By combining the Poisson and lognormal distributions, this model provides a flexible approach to modeling count data that exhibit greater variability than what can be captured by a simple Poisson distribution alone.
Intercept (~1): - There is a common baseline level of abundance for taxa in the microbiome samples, and zero abundance represents a biological state (not a data artifact), then Abundance ~ 1 + offset(log(Offset)) is appropriate. The intercept allows to model this baseline. - Implicitly assume that all taxa have some baseline level of abundance, even if some taxa may have zero abundance in certain samples. Whith this model, we are looking at how each taxon’s abundance varies around this assumed baseline level.
No Intercept (~0): - Zero abundance represents the true absence of a taxon and there isn’t a common baseline abundance that all taxa share, then Abundance ~ 0 + offset(log(Offset)) might be more appropriate. This formulation explicitly models the starting point as zero. - Baseline Level: This refers to a hypothetical or observed starting point or average level of abundance across all taxa in your microbiome samples. It represents a point of comparison against which you assess changes or differences. - This approach is useful when zero abundance values (where taxa are absent) are not due to technical limitations but rather reflect genuine biological absence. It allows to interpret changes in abundance levels in relation to what you consider to be the typical or expected level of abundance for each taxon.
Offset (offset(log(Offset))): - Purpose: The offset term adjusts for differences in exposure or scaling factors that are known and fixed for each observation - Example: In microbiome studies, offsets are often used to account for differences in sequencing depth or sampling effort across samples. The logarithm transformation (log(Offset)) is common to handle the typical skewness in abundance data.
We can test both models (~1 and ~0) and evaluate their goodness of fit metrics (like AIC or BIC) to determine which one better captures the variability in our data.
##
## Initialization...
## Adjusting a full covariance PLN model with nlopt optimizer
## Post-treatments...
## DONE!
##
## Initialization...
## Adjusting a full covariance PLN model with nlopt optimizer
## Post-treatments...
## DONE!
rbind(
"No Intercept" = Model_01_0$criteria,
"With Intercept" = Model_02_1$criteria
) %>% knitr::kable()| nb_param | loglik | BIC | ICL | |
|---|---|---|---|---|
| No Intercept | 378 | -2450.234 | -3185.788 | -4940.717 |
| With Intercept | 405 | -2376.152 | -3164.245 | -4792.277 |
With Intercept (PLN(PLN_data$Abundance ~ 1 + offset(log(Offset)), PLN_data)) appears to be the better model choice among the two based on
the given criteria.
##
## Initialization...
## Adjusting a full covariance PLN model with nlopt optimizer
## Post-treatments...
## DONE!
##
## Initialization...
## Adjusting a full covariance PLN model with nlopt optimizer
## Post-treatments...
## DONE!
##
## Initialization...
## Adjusting a full covariance PLN model with nlopt optimizer
## Post-treatments...
## DONE!
##
## Initialization...
## Adjusting a full covariance PLN model with nlopt optimizer
## Post-treatments...
## DONE!
##
## Initialization...
## Adjusting a full covariance PLN model with nlopt optimizer
## Post-treatments...
## DONE!
##
## Initialization...
## Adjusting a full covariance PLN model with nlopt optimizer
## Post-treatments...
## DONE!
results <- rbind(
"PLN_data" = Model_01$criteria,
"PLN_data Offset" = Model_02$criteria,
"PLN_data sex" = Model_03$criteria,
"PLN_data Ecological gradient" = Model_04$criteria,
"PLN_data Caracteristic Gradient" = Model_05$criteria,
"PLN_data Infection" = Model_06$criteria
)
print(results) %>%
arrange(loglik) %>%
knitr::kable()## nb_param loglik BIC ICL
## PLN_data 405 -2379.161 -3167.254 -4873.842
## PLN_data Offset 405 -2376.152 -3164.245 -4792.277
## PLN_data sex 432 -2368.872 -3209.505 -4827.396
## PLN_data Ecological gradient 486 -2311.695 -3257.407 -4808.464
## PLN_data Caracteristic Gradient 486 -2334.568 -3280.280 -4699.698
## PLN_data Infection 432 -2365.896 -3206.529 -4759.633
| nb_param | loglik | BIC | ICL | |
|---|---|---|---|---|
| PLN_data | 405 | -2379.161 | -3167.254 | -4873.842 |
| PLN_data Offset | 405 | -2376.152 | -3164.245 | -4792.277 |
| PLN_data sex | 432 | -2368.872 | -3209.505 | -4827.396 |
| PLN_data Infection | 432 | -2365.896 | -3206.529 | -4759.633 |
| PLN_data Caracteristic Gradient | 486 | -2334.568 | -3280.280 | -4699.698 |
| PLN_data Ecological gradient | 486 | -2311.695 | -3257.407 | -4808.464 |
Selected PLN_data Ecological gradient.
One can access the fitted value of the counts (Abundance – Y^) and check that the algorithm basically learned correctly from the data:
data.frame(
fitted = as.vector(fitted(Model_06)),
observed = as.vector(PLN_data$Abundance)
) %>%
ggplot(aes(x = observed, y = fitted)) +
geom_point(size = .5, alpha = .25) +
scale_x_log10() +
scale_y_log10() +
theme_bw() +
annotation_logticks()## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
The residual correlation matrix better displays as an image matrix:
myPLN_weighted <-
PLN(
PLN_data$Abundance ~ 1 + Ecological.gradient + offset(log(Offset)),
data = PLN_data,
weights = runif(nrow(PLN_data)),
control = PLN_param(trace = 0)
)
data.frame(
unweighted = as.vector(fitted(Model_06)),
weighted = as.vector(fitted(myPLN_weighted))
) %>%
ggplot(aes(x = unweighted, y = weighted)) +
geom_point(size = .5, alpha = .25) +
scale_x_log10(
breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x))
) +
scale_y_log10(
breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x))
) +
theme_bw() +
annotation_logticks() +
labs(x = "Unweighted Fitted Values", y = "Weighted Fitted Values")Effect of Small Weights: Observations with very small weights might be receiving much less influence in the weighted model. If an observation has a weight close to zero, its contribution to the fitted values will be minimal. This can lead to significant deviations between the unweighted and weighted models, especially if the unweighted model provides a much larger fitted value for those observations.
General Linear Trend: Most observations align with a linear relationship, indicating that the weights are generally applied in a consistent manner across the dataset. Distinct Clouds: The additional clouds suggest that specific subsets of data behave differently, either due to the extreme or moderate influence of weights, unique characteristics of those observations, or potential data issues.
PLN_data <- PLN_data %>% as.data.frame()
# Extract unweighted fitted values
a <- fitted(Model_06) %>% as.data.frame()
a <- a %>%
mutate(Sample = rownames(PLN_data)) %>%
column_to_rownames("Sample")
colnames(a) <- PLN_data$Abundance %>% colnames()
unweighted_df <- a %>%
mutate(Ecological.gradient = PLN_data$Ecological.gradient) %>%
rownames_to_column("Sample") %>%
pivot_longer(-c(Sample, Ecological.gradient), names_to = "Variable", values_to = "Unweighted")
# Extract weighted fitted values
b <- fitted(myPLN_weighted) %>% as.data.frame()
b <- b %>%
mutate(Sample = rownames(PLN_data)) %>%
column_to_rownames("Sample")
colnames(b) <- PLN_data$Abundance %>% colnames()
weighted_df <- b %>%
mutate(Ecological.gradient = PLN_data$Ecological.gradient) %>%
rownames_to_column("Sample") %>%
pivot_longer(-c(Sample, Ecological.gradient), names_to = "Variable", values_to = "Weighted")
# Merge the unweighted and weighted data
merged_df <- unweighted_df %>%
select(Sample, Variable, Unweighted) %>%
dplyr::left_join(weighted_df, by = c("Sample", "Variable")) %>%
select(Sample, Variable, Unweighted, Weighted, Ecological.gradient)
# Inspect the merged data frame
head(merged_df)## # A tibble: 6 × 5
## Sample Variable Unweighted Weighted Ecological.gradient
## <chr> <chr> <dbl> <dbl> <chr>
## 1 GI-52 Acinetobacter 0.00238 0.0168 ecotone
## 2 GI-52 Allorhizobium-Neorhizobium-Par… 0.0258 0.00118 ecotone
## 3 GI-52 Anoxybacillus 0.0552 0.160 ecotone
## 4 GI-52 Arsenophonus 0.0956 0.103 ecotone
## 5 GI-52 Bacillus 0.133 0.832 ecotone
## 6 GI-52 Buchnera 34.0 35.6 ecotone
# Define your color palette
colors <- c(
"wild" = "#009392",
"ecotone" = "#9ccb86",
"farmed land" = "#EEB479",
"urban" = "#CF597E"
)
# Plot with transparency
ggplot(merged_df, aes(x = Unweighted, y = Weighted, color = Ecological.gradient)) +
geom_point(size = 1, alpha = 0.4) + # Adjust alpha for transparency
scale_x_log10() +
scale_y_log10() +
scale_color_manual(values = colors) +
theme_bw() +
labs(
x = "Unweighted Fitted Value",
y = "Weighted Fitted Value",
color = "Ecological.gradient"
) +
theme(legend.position = "bottom")results_df <- data.frame(
unweighted = as.vector(fitted(Model_06)),
weighted = as.vector(fitted(myPLN_weighted)),
"Sample" = rownames(PLN_data) # Assuming sample IDs are rownames or add another identifier
)
# Filter the dataframe based on the conditions
filtered_samples <- merged_df %>% filter(Unweighted > 1e-5 & Weighted < 1e-4)
# Print the filtered samples
head(filtered_samples)## # A tibble: 6 × 5
## Sample Variable Unweighted Weighted Ecological.gradient
## <chr> <chr> <dbl> <dbl> <chr>
## 1 GI-61 Candidatus Baumannia 0.212 0.0000445 farmed land
## 2 GI-61 Candidatus Ishikawaella 0.0461 0.0000450 farmed land
## 3 GI-62 Arsenophonus 0.00235 0.0000487 farmed land
## 4 GI-62 Candidatus Baumannia 0.0200 0.0000000851 farmed land
## 5 GI-62 Candidatus Ishikawaella 0.00144 0.0000000786 farmed land
## 6 GI-62 Halomonas 0.000328 0.0000123 farmed land
# Filter the dataframe based on the conditions
filtered_samples <- merged_df %>% filter(Unweighted < 1e-4 & Weighted > 1e-6)
# Print the filtered samples
head(filtered_samples)## # A tibble: 6 × 5
## Sample Variable Unweighted Weighted Ecological.gradient
## <chr> <chr> <dbl> <dbl> <chr>
## 1 GI-53 Cupriavidus 5.12e-7 3.37e-4 urban
## 2 GI-54 Cupriavidus 5.35e-7 2.13e-4 ecotone
## 3 GI-56 Allorhizobium-Neorhizobium-Par… 6.15e-5 5.26e-3 ecotone
## 4 GI-56 Cupriavidus 2.26e-9 1.91e-4 ecotone
## 5 GI-62 Cupriavidus 9.58e-9 5.24e-5 farmed land
## 6 GI-64 Jeotgalibaca 3.70e-5 3.77e-4 farmed land
ticks_pca_clim <- PLNPCA(
formula = PLN_data$Abundance ~ 1 +
Ecological.gradient +
offset(log(Offset)),
data = PLN_data,
ranks = 1:20
)##
## Initialization...
##
## Adjusting 20 PLN models for PCA analysis.
## Rank approximation = 2 Rank approximation = 1 Rank approximation = 16 Rank approximation = 13 Rank approximation = 20 Rank approximation = 17 Rank approximation = 12 Rank approximation = 3 Rank approximation = 8 Rank approximation = 14 Rank approximation = 11 Rank approximation = 4 Rank approximation = 7 Rank approximation = 19 Rank approximation = 18 Rank approximation = 15 Rank approximation = 10 Rank approximation = 6 Rank approximation = 5 Rank approximation = 9
## Post-treatments
## DONE!
sigma(ticks_pca_clim_best) %>%
corrplot::corrplot(is.corr = FALSE, tl.cex = 0.5, col = corrplot::COL2("PiYG"), type = "lower", diag = T)# Define the threshold function
keep_high_correlations <- function(x) {
any(x > 2 | x < -1)
}
########################### Climatic ##################
rows_to_keep <- apply(sigma(ticks_pca_clim_best), 1, keep_high_correlations)
cols_to_keep <- apply(sigma(ticks_pca_clim_best), 2, keep_high_correlations)
# Subset the matrix to keep only the selected rows and columns
ticks_pca_clim_best_filtered <- sigma(ticks_pca_clim_best)[rows_to_keep, cols_to_keep]
# Print the head of the filtered matrix to check
# head(oaks_best_pca_filtered)
ticks_pca_clim_best_filtered %>%
corrplot::corrplot(is.corr = FALSE, tl.cex = 0.4, col = corrplot::COL2("PiYG"), type = "lower", diag = T)factoextra::fviz_pca_biplot(ticks_pca_clim_best,
select.var = list(contrib = 10),
labels = NULL,
geom = "point"
) # CONTRIBUTION TOP 20 CLUSTER.fviz_pca_var(ticks_pca_clim_best,
col.var = "contrib",
gradient.cols = c("#00AFBB",
"#E7B800",
"#FC4E07"),
select.var = list(contrib = 20)
)ticks_pca_clim_best$var %>%
as.data.frame() %>%
select(contrib.Dim.1, contrib.Dim.2) %>%
arrange(contrib.Dim.1) %>%
tail(n = 10) %>%
knitr::kable()| contrib.Dim.1 | contrib.Dim.2 | |
|---|---|---|
| Staphylococcus | 4.275688 | 0.2012983 |
| Planococcus | 5.210675 | 6.6617925 |
| Streptococcus | 5.665843 | 0.0640939 |
| Fictibacillus | 6.018175 | 2.7755791 |
| Enterococcus | 6.122822 | 2.3719877 |
| Anoxybacillus | 6.426533 | 0.4444764 |
| Halomonas | 7.196764 | 0.0476194 |
| Arsenophonus | 7.342354 | 13.4084864 |
| Candidatus Ishikawaella | 11.073589 | 13.1191328 |
| Candidatus Baumannia | 13.497340 | 7.8311907 |
networks_full <- PLNnetwork(formula = PLN_data$Abundance ~ 1 +
Ecological.gradient +
offset(log(Offset)), data = PLN_data)##
## Initialization...
## Adjusting 30 PLN with sparse inverse covariance estimation
## Joint optimization alternating gradient descent and graphical-lasso
## sparsifying penalty = 44.26583 sparsifying penalty = 40.88706 sparsifying penalty = 37.76618 sparsifying penalty = 34.88352 sparsifying penalty = 32.22089 sparsifying penalty = 29.7615 sparsifying penalty = 27.48983 sparsifying penalty = 25.39155 sparsifying penalty = 23.45344 sparsifying penalty = 21.66326 sparsifying penalty = 20.00972 sparsifying penalty = 18.48239 sparsifying penalty = 17.07165 sparsifying penalty = 15.76859 sparsifying penalty = 14.56498 sparsifying penalty = 13.45325 sparsifying penalty = 12.42637 sparsifying penalty = 11.47788 sparsifying penalty = 10.60178 sparsifying penalty = 9.792559 sparsifying penalty = 9.045101 sparsifying penalty = 8.354696 sparsifying penalty = 7.716989 sparsifying penalty = 7.127958 sparsifying penalty = 6.583887 sparsifying penalty = 6.081345 sparsifying penalty = 5.617161 sparsifying penalty = 5.188408 sparsifying penalty = 4.792381 sparsifying penalty = 4.426583
## Post-treatments
## DONE!
| param | nb_param | loglik | BIC | ICL | n_edges | EBIC | pen_loglik | density | stability |
|---|---|---|---|---|---|---|---|---|---|
| 44.26583 | 135 | -3764.443 | -4027.141 | -5578.442 | 0 | -4027.141 | -3785.435 | 0.000000 | NA |
| 40.88706 | 135 | -3732.180 | -3994.878 | -5546.361 | 0 | -3994.878 | -3752.830 | 0.000000 | NA |
| 37.76618 | 135 | -3701.010 | -3963.707 | -5515.199 | 0 | -3963.707 | -3721.305 | 0.000000 | NA |
| 34.88352 | 135 | -3670.830 | -3933.527 | -5485.028 | 0 | -3933.527 | -3690.760 | 0.000000 | NA |
| 32.22089 | 135 | -3641.636 | -3904.334 | -5455.858 | 0 | -3904.334 | -3661.190 | 0.000000 | NA |
| 29.76150 | 137 | -3612.872 | -3879.462 | -5431.025 | 2 | -3884.630 | -3632.061 | 0.005487 | NA |
A diagnostic of the optimization process is available via the `convergence field:
| param | nb_param | status | backend | objective | iterations | convergence | |
|---|---|---|---|---|---|---|---|
| out | 44.26583 | 135 | 3 | nlopt | 3764.442793 | 2.000000 | 0.000007 |
| elt | 40.88706 | 135 | 4 | nlopt | 3732.180189 | 3.000000 | 0.000004 |
| elt.1 | 37.76618 | 135 | 4 | nlopt | 3701.009586 | 2.000000 | 0.000001 |
| elt.2 | 34.88352 | 135 | 4 | nlopt | 3670.829585 | 2.000000 | 0.000000 |
| elt.3 | 32.22089 | 135 | 3 | nlopt | 3641.636475 | 2.000000 | 0.000003 |
| elt.4 | 29.76150 | 137 | 4 | nlopt | 3612.872379 | 2.000000 | 0.000002 |
convergence: This column shows the convergence tolerance or criterion. It represents how close the algorithm was to meeting the convergence criteria, with smaller values indicating closer adherence to the convergence criteria. Values close to zero (like the ones shown here) indicate that the models have converged successfully.
coefficient_path(networks_full, corr = TRUE) %>%
ggplot(aes(x = Penalty, y = Coeff, group = Edge, colour = Edge)) +
geom_line(show.legend = FALSE) +
coord_trans(x = "log10") +
theme_bw()How to Interpret:
##
## Stability Selection for PLNnetwork:
## subsampling: ++++++++++++++++++++
## Poisson Lognormal with sparse inverse covariance (penalty = 14.6)
## ==================================================================
## nb_param loglik BIC ICL n_edges EBIC pen_loglik density
## 154 -3368.619 -3668.289 -5220.122 19 -3695.994 -3385.108 0.052
## ==================================================================
## * Useful fields
## $model_par, $latent, $latent_pos, $var_par, $optim_par
## $loglik, $BIC, $ICL, $loglik_vec, $nb_param, $criteria
## * Useful S3 methods
## print(), coef(), sigma(), vcov(), fitted()
## predict(), predict_cond(), standard_error()
## * Additional fields for sparse network
## $EBIC, $density, $penalty
## * Additional S3 methods for network
## plot.PLNnetworkfit()
We can finally check that the fitted value of the counts – even with sparse regularization of the covariance matrix – are close to the observed ones:
data.frame(
fitted = as.vector(fitted(model_StARS)),
observed = as.vector(PLN_data$Abundance)
) %>%
ggplot(aes(x = observed, y = fitted)) +
geom_point(size = .5, alpha = .25) +
scale_x_log10(limits = c(1, 4000)) +
scale_y_log10(limits = c(1, 4000)) +
theme_bw() +
annotation_logticks()## Warning in scale_x_log10(limits = c(1, 4000)): log-10 transformation introduced
## infinite values.
## Warning: Removed 1046 rows containing missing values or values outside the scale range
## (`geom_point()`).
ticks_best_network_2 <- model_StARS
my_graph <- plot(ticks_best_network_2, plot = F)
Isolated <- which(igraph::degree(my_graph) == 0)
my_graph_cleaned <- delete_vertices(my_graph, Isolated)
my_graph <- my_graph_cleaned
vertices_names <- V(my_graph)$name
result <- merged_metagenomes@tax_table %>%
as.data.frame() %>%
filter(Genus %in% vertices_names) %>%
select(Genus, Family)
# Extract unique families
unique_families <- unique(result$Family)
# Generate distinct colors for each family
n_colors <- length(unique_families)
colors <- khroma::colour("smooth rainbow")(n_colors)
# Create a named vector to map each family to a color
family_colors <- setNames(colors, unique_families)
# Add the color information to the result dataframe
result <- result %>%
mutate(Color = family_colors[Family])
# extract abundance OTU
otu_table_df <- as.data.frame(otu_table(physeq))
abundance <- otu_table_df %>%
mutate(Abundance = rowSums(.)) %>%
rownames_to_column("vertices_left")
abundance_solo <- abundance %>%
as.data.frame() %>%
filter(vertices_left %in% vertices_names) %>%
select(Abundance)
# # Convert the Abundance column to a numeric vector
# vertex_sizes <- as.numeric(abundance_solo$Abundance)
# # Scale the vertex sizes for better visibility
# vertex_sizes <- (vertex_sizes / max(vertex_sizes)) * 10 # Adjust the multiplier as needed
# Assign colors to the vertices
V(my_graph)$color <- result$Color
# Assign sizes to the vertices
# V(my_graph)$size <- vertex_sizes
# Choose a layout that spreads out the vertices
layout <- igraph::layout_in_circle(my_graph)
# Define unique colors and families for the legend
unique_colors <- unique(result$Color)
unique_families <- unique(result$Family)
# Define edge width for the legend
edge_widths <- E(my_graph)$width
unique_edge_widths <- unique(edge_widths)
# Edge - Arc colors
weight <- as.numeric(E(my_graph)$weight)
color_palette <- colorRampPalette(c("yellow", "red"))(200)
normalized_weights <- (weight - min(weight)) / (max(weight) - min(weight))
edge_colors <- sapply(seq_along(normalized_weights), function(i) {
if (i == 97) {
"#009fff"
} else {
color_palette[as.integer(normalized_weights[i] * (length(color_palette) - 1)) + 1]
}
})
edge_attr(my_graph)$color <- edge_colors
E(my_graph)$color <- edge_colors
plot(simplify(my_graph),
layout = layout,
vertex.size = (V(my_graph)$size) * 2,
vertex.label.dist = 1,
vertex.label.cex = 0.5,
edge.width = E(my_graph)$width,
edge.color = E(my_graph)$color,
margin = 0.2,
main = "Network based on the GLM ~ month + Offset by sex group",
sub = "Stability threashold 0.20 - Verticles colored by families, Edges colored by weight",
rescale = T
)
legend("left",
legend = unique_families,
col = unique_colors,
pch = 19,
title = "Families",
pt.cex = 2,
cex = 0.5,
bty = "n",
y.intersp = 0.6,
inset = 1 / 10
)## a 'v_group'-'v_class' should not match multiple colors, update 'color'.
## a 'e_type' should not match multiple colors, update 'color'.
V(my_graph_metaN)$v_group <- result$Family
V(my_graph_metaN)$color <- V(my_graph)$color
V(my_graph_metaN)$size <- vertex_attr(my_graph)$size
V(my_graph_metaN)$v_class <- result$Family
E(my_graph_metaN)$color <- E(my_graph)$color
# Set the background color to black using par and text parameters
par(bg = "white", family = "serif", col.axis = "lightgrey", col.lab = "lightgrey", col.main = "lightgrey", col.sub = "lightgrey")
c_net_plot(my_graph_metaN,
vertex.color = V(my_graph_metaN)$color, vertex.size = ((V(my_graph_metaN)$size)*2),
vertex.label.color = "black",vertex.label = V(my_graph_metaN)$name, vertex.label.cex = 0.5,
edge_width_range = c(0.5, 3), edge.color = "green4", edge.curved = 0.4,
legend = F, legend_number = F,
edge_legend_title = "Weight",
size_legend = F, size_legend_title = "Abundance",
width_legend = T, width_legend_title = "Weight"
)
# vertex_attr(my_graph_metaN)
# vertex_attr(my_graph)
legend("right",
legend = unique_families,
col = unique_colors,
# title = "Families",
pch = 19,
pt.cex = 2,
cex = 0.6,
text.font = 3,
bty = "n",
y.intersp = 0.4,
inset = 6 / 10
)
legend_x <- 0.85
legend_y <- 0.9
text(x = legend_x - 2.5, y = legend_x - 0.35, labels = "Families", pos = 1, cex = 0.8, font = 4)#
# # Define the thickness range
# min_thickness <- 0.5
# max_thickness <- 3
#
# # Calculate the thickness for each weight
# thickness_values <- min_thickness + (max_thickness - min_thickness) * normalized_weights
#
# # Create a unique set of thickness values for the legend
# unique_thickness <- unique(round(thickness_values,digits = 0))
#
# # Generate labels for the thickness values
# thickness_labels <- round(unique_thickness, 2)
#
# # Add the legend for the weights
# legend("right",
# legend = thickness_labels,
# col = "black",
# lty = 1, # Line type
# lwd = unique_thickness, # Line width corresponding to thickness
# title = "Weights",
# cex = 0.5, y.intersp = 0.5,
# text.font = 3 # Italic text
# )V(my_graph_metaN)$v_class <- result$Family
c_net_plot(my_graph_metaN,
coors = g_layout_polygon(my_graph_metaN, group = "v_class")
)## get edgelist
edgelist = as_edgelist(my_graph)
order_species <- c("Trueperella_1","Corynebacterium_6", "Fusobacterium_2","Corynebacterium_1", "Corynebacterium_3", "Corynebacterium_4", "Corynebacterium_8",
"Streptococcus_1", "Peptoniphilus_1", "Mycobacterium_1", "Rhodococcus_1", "Multi-affiliation1_1", "Sphingomonas_1",
"Francisella_10", "Francisella_6", "Francisella_9", "Williamsia_2", "Corynebacterium_5", "Multi-affiliation_1", "Corynebacterium_7",
"Halomonas_2", "Mycobacterium_2", "Williamsia_1", "Nocardioides_1", "Francisella_8", "Xanthomonas_1", "Acinetobacter_2",
"Staphylococcus_3", "Staphylococcus_4", "Acinetobacter_1", "Candidatus Midichloria_1", "Francisella_4", "Francisella_5",
"Candidatus Midichloria_2", "Candidatus Midichloria_6", "Candidatus Midichloria_7")
result <- result %>% mutate(size = vertex_attr(my_graph)$size)
result_ordered <- result[match(order_species, result$Genus), ]
# Set the background color to black using par and text parameters
par(bg = "black", family = "serif", col.axis = "lightgrey", col.lab = "lightgrey", col.main = "lightgrey", col.sub = "lightgrey")
# plot arc diagram
arcplot(edgelist,
cex.labels = 0.7,
sorted = FALSE,
show.nodes = TRUE,
bg.nodes = result_ordered$Color,
cex.nodes = (result_ordered$size) * 0.8,
pch.nodes = 21,
lwd.nodes = 0.2,
line = 0,
lwd.arcs = 20 * (edge_attr(my_graph)$weight),
col.arcs = edge_attr(my_graph)$color,
col.labels = "lightgrey", # Set font color for labels
col.nodes = "black", # Set font color for node labels
bg = NA # Use NA to keep the par bg color
# above = c(1:97,99:114)
)
legend("right",
legend = unique_families,
col = unique_colors,
pch = 19,
title = "Families",
pt.cex = 2,
cex = 0.7,
text.font = 3, # Set font to Times New Roman (serif)
text.col = "lightgrey", # Set font color to light grey
bty = "n",
y.intersp = 0.5
# inset = 1 / 10
)PLN_pca_F <- PLNPCA(
formula = data_F$Abundance ~ 1 +
offset(log(Offset)),
data = data_F,
ranks = 1:20
)##
## Initialization...
##
## Adjusting 20 PLN models for PCA analysis.
## Rank approximation = 5 Rank approximation = 12 Rank approximation = 15 Rank approximation = 9 Rank approximation = 20 Rank approximation = 6 Rank approximation = 4 Rank approximation = 2 Rank approximation = 7 Rank approximation = 18 Rank approximation = 10 Rank approximation = 11 Rank approximation = 19 Rank approximation = 17 Rank approximation = 14 Rank approximation = 8 Rank approximation = 16 Rank approximation = 1 Rank approximation = 13 Rank approximation = 3
## Post-treatments
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
## Loading required package: phyloseq
##
## DONE!
sigma(PLN_pca_F_best) %>%
corrplot::corrplot(is.corr = FALSE, tl.cex = 0.5, col = corrplot::COL2("PiYG"), type = "lower", diag = T)# Define the threshold function
keep_high_correlations <- function(x) {
any(x > 2 | x < -1)
}
########################### Climatic ##################
rows_to_keep <- apply(sigma(PLN_pca_F_best), 1, keep_high_correlations)
cols_to_keep <- apply(sigma(PLN_pca_F_best), 2, keep_high_correlations)
# Subset the matrix to keep only the selected rows and columns
PLN_pca_F_best_filtered <- sigma(PLN_pca_F_best)[rows_to_keep, cols_to_keep]
# Print the head of the filtered matrix to check
# head(oaks_best_pca_filtered)
PLN_pca_F_best_filtered %>%
corrplot::corrplot(is.corr = FALSE, tl.cex = 0.4, col = corrplot::COL2("PiYG"), type = "lower", diag = T)factoextra::fviz_pca_biplot(PLN_pca_F_best,
select.var = list(contrib = 10),
labels = NULL,
geom = "point"
) # CONTRIBUTION TOP 20 CLUSTER.# Define your color palette
colors <- c(
"wild" = "#009392",
"ecotone" = "#9ccb86",
"farmed land" = "#EEB479",
"urban" = "#CF597E"
)
# Plot PCA with ordered legend
fviz_pca_ind(PLN_pca_F_best,
col.ind = data_F$Ecological.gradient,
palette = colors,
geom.ind = "point",
repel = TRUE,
pointsize = 4,
pointshape = 19,
col.var = "black",
arrowsize = 0.6,
labelsize = 4
) +
labs(color = "Ecological gradient") +
scale_color_manual(values = colors) +
guides(color = guide_legend(order = 1))## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
fviz_pca_var(PLN_pca_F_best,
col.var = "contrib",
gradient.cols = c("#00AFBB",
"#E7B800",
"#FC4E07"),
select.var = list(contrib = 20)
)PLN_pca_F_best$var %>%
as.data.frame() %>%
select(contrib.Dim.1, contrib.Dim.2) %>%
arrange(contrib.Dim.1) %>%
tail(n = 10) %>%
knitr::kable()| contrib.Dim.1 | contrib.Dim.2 | |
|---|---|---|
| Wigglesworthia | 0.4823455 | 1.6832498 |
| Buchnera | 0.4939708 | 1.8357243 |
| Staphylococcus | 0.5513401 | 1.4168548 |
| Fictibacillus | 0.5899444 | 0.0569365 |
| Candidatus Curculioniphilus | 0.8234310 | 1.7290751 |
| Cupriavidus | 1.0517940 | 43.7093708 |
| Candidatus Ishikawaella | 6.8902259 | 0.0589862 |
| Candidatus Baumannia | 10.0833813 | 3.3880136 |
| Arsenophonus | 17.7778363 | 0.3021989 |
| Halomonas | 57.7739258 | 0.4224482 |
networks_F <- PLNnetwork(formula = data_F$Abundance ~ 1 +
Ecological.gradient +
offset(log(Offset)), data = data_F)##
## Initialization...
## Adjusting 30 PLN with sparse inverse covariance estimation
## Joint optimization alternating gradient descent and graphical-lasso
## sparsifying penalty = 20.55506 sparsifying penalty = 18.98611 sparsifying penalty = 17.53692 sparsifying penalty = 16.19834 sparsifying penalty = 14.96194 sparsifying penalty = 13.8199 sparsifying penalty = 12.76504 sparsifying penalty = 11.7907 sparsifying penalty = 10.89072 sparsifying penalty = 10.05944 sparsifying penalty = 9.291615 sparsifying penalty = 8.582394 sparsifying penalty = 7.927308 sparsifying penalty = 7.322223 sparsifying penalty = 6.763324 sparsifying penalty = 6.247085 sparsifying penalty = 5.770251 sparsifying penalty = 5.329812 sparsifying penalty = 4.922992 sparsifying penalty = 4.547225 sparsifying penalty = 4.200139 sparsifying penalty = 3.879546 sparsifying penalty = 3.583423 sparsifying penalty = 3.309904 sparsifying penalty = 3.057262 sparsifying penalty = 2.823903 sparsifying penalty = 2.608357 sparsifying penalty = 2.409264 sparsifying penalty = 2.225367 sparsifying penalty = 2.055506
## Post-treatments
## DONE!
coefficient_path(networks_F, corr = TRUE) %>%
ggplot(aes(x = Penalty, y = Coeff, group = Edge, colour = Edge)) +
geom_line(show.legend = FALSE) +
coord_trans(x = "log10") +
theme_bw()##
## Stability Selection for PLNnetwork:
## subsampling: ++++++++++++++++++++
networks_F_2 <- getBestModel(networks_F, crit = "StARS", stability = 0.9)
my_graph <- plot(networks_F_2, plot = F)
Isolated <- which(igraph::degree(my_graph) == 0)
my_graph_cleaned <- igraph::delete_vertices(my_graph, Isolated)
# my_graph_cleaned <- delete_vertices(my_graph, c("Francisella_8", "Candidatus Midichloria_11"))
my_graph <- my_graph_cleaned
vertices_names <- V(my_graph)$name
taxo <- tax_table(physeq)
result <- merged_metagenomes@tax_table %>%
as.data.frame() %>%
filter(Genus %in% vertices_names) %>%
select(Genus, Family)
# Extract unique families
unique_families <- unique(result$Family)
# Generate distinct colors for each family
n_colors <- length(unique_families)
colors <- khroma::colour("smooth rainbow")(n_colors)
# Create a named vector to map each family to a color
family_colors <- setNames(colors, unique_families)
# Add the color information to the result dataframe
result <- result %>%
mutate(Color = family_colors[Family])
# extract abundance OTU
otu_table_df <- as.data.frame(otu_table(physeq))
abundance <- otu_table_df %>%
mutate(Abundance = rowSums(.)) %>%
rownames_to_column("vertices_left")
abundance_solo <- abundance %>%
as.data.frame() %>%
filter(vertices_left %in% vertices_names) %>%
select(Abundance)
# # Convert the Abundance column to a numeric vector
# vertex_sizes <- as.numeric(abundance_solo$Abundance)
# # Scale the vertex sizes for better visibility
# vertex_sizes <- (vertex_sizes / max(vertex_sizes)) * 10 # Adjust the multiplier as needed
# Assign colors to the vertices
V(my_graph)$color <- result$Color
# Assign sizes to the vertices
# V(my_graph)$size <- vertex_sizes
# Choose a layout that spreads out the vertices
layout <- igraph::layout_in_circle(my_graph)
# Define unique colors and families for the legend
unique_colors <- unique(result$Color)
unique_families <- unique(result$Family)
# Define edge width for the legend
edge_widths <- E(my_graph)$width
unique_edge_widths <- unique(edge_widths)
# Edge - Arc colors
weight <- as.numeric(E(my_graph)$weight)
color_palette <- colorRampPalette(c("yellow", "red"))(200)
normalized_weights <- (weight - min(weight)) / (max(weight) - min(weight))
edge_colors <- sapply(seq_along(normalized_weights), function(i) {
if (i == 97) {
"#009fff"
} else {
color_palette[as.integer(normalized_weights[i] * (length(color_palette) - 1)) + 1]
}
})
edge_attr(my_graph)$color <- edge_colors
E(my_graph)$color <- edge_colors
plot(simplify(my_graph),
layout = layout,
vertex.size = (V(my_graph)$size) * 2,
vertex.label.dist = 1,
vertex.label.cex = 0.5,
edge.width = E(my_graph)$width,
edge.color = E(my_graph)$color,
margin = 0.2,
main = "Network based on the GLM ~ month",
sub = "Stability threashold 0.90 - Verticles colored by families, Edges colored by weight",
rescale = T
)
legend("left",
legend = unique_families,
col = unique_colors,
pch = 19,
title = "Families",
pt.cex = 2,
cex = 0.5,
bty = "n",
y.intersp = 0.6,
inset = 1 / 10
)edgelist = as_edgelist(my_graph)
order_species <- c("Corynebacterium_3", "Corynebacterium_8", "Fusobacterium_2", "Corynebacterium_3", "Corynebacterium_4", "Francisella_10", "Francisella_9", "Corynebacterium_6", "Mycobacterium_2", "Rhodococcus_1" , "Williamsia_1" , "Williamsia_2", "Nocardioides_1" , "Sphingomonas_1", "Streptococcus_1", "Multi-affiliation_1", "Staphylococcus_3", "Staphylococcus_4","Staphylococcus_6" , "Peptoniphilus_1", "Candidatus Midichloria_2", "Candidatus Midichloria_6", "Candidatus Midichloria_7", "Candidatus Midichloria_3", "Rickettsia_3", "Candidatus Midichloria_4", "Candidatus Midichloria_5", "Rickettsia_1", "Rickettsia_11", "Rickettsia_2", "Rickettsia_6","Rickettsia_10", "Candidatus Midichloria_9","Rickettsia_7", "Rickettsia_7")
result <- result %>% mutate(size = vertex_attr(my_graph)$size)
result_ordered <- result[match(order_species, result$Genus), ]
# Set the background color to black using par and text parameters
par(bg = "black", family = "serif", col.axis = "lightgrey", col.lab = "lightgrey", col.main = "lightgrey", col.sub = "lightgrey")
# plot arc diagram
arcplot(edgelist,
cex.labels = 0.7,
sorted = FALSE,
show.nodes = TRUE,
bg.nodes = result_ordered$Color,
cex.nodes = (result_ordered$size) * 0.8,
pch.nodes = 21,
lwd.nodes = 0.2,
line = 0,
lwd.arcs = 20 * (edge_attr(my_graph)$weight),
col.arcs = edge_attr(my_graph)$color,
col.labels = "lightgrey", # Set font color for labels
col.nodes = "black", # Set font color for node labels
bg = NA # Use NA to keep the par bg color
# above = c(1:97,99:114)
)
legend("topright",
legend = unique_families,
col = unique_colors,
pch = 19,
title = "Families",
pt.cex = 2,
cex = 0.7,
text.font = 3, # Set font to Times New Roman (serif)
text.col = "lightgrey", # Set font color to light grey
bty = "n",
y.intersp = 0.45,
inset = c(-0.6, -0.02))## a 'v_group'-'v_class' should not match multiple colors, update 'color'.
## a 'e_type' should not match multiple colors, update 'color'.
V(my_graph_metaN)$v_group <- result$Family
V(my_graph_metaN)$color <- V(my_graph)$color
V(my_graph_metaN)$size <- vertex_attr(my_graph)$size
V(my_graph_metaN)$v_class <- result$Family
E(my_graph_metaN)$color <- E(my_graph)$color
# Set the background color to black using par and text parameters
par(bg = "white", family = "serif", col.axis = "lightgrey", col.lab = "lightgrey", col.main = "lightgrey", col.sub = "lightgrey")
c_net_plot(my_graph_metaN,
vertex.color = V(my_graph_metaN)$color,
vertex.size = ((V(my_graph_metaN)$size)*1),
vertex.label.color = "black",
vertex.label = V(my_graph_metaN)$name,
vertex.label.cex = 0.9,
edge_width_range = c(0.5, 3), edge.color = "#AA0000", edge.curved = 0.4,
legend = F, legend_number = F,
edge_legend_title = "Weight",
size_legend = F, size_legend_title = "Abundance",
width_legend = T, width_legend_title = "Weight",
main = "Female Network"
)
# vertex_attr(my_graph_metaN)
# vertex_attr(my_graph)
legend("right",
legend = unique_families,
col = unique_colors,
# title = "Families",
pch = 19,
pt.cex = 2,
cex = 0.6,
text.font = 3,
bty = "n",
y.intersp = 0.4,
inset = 8 / 10
)
legend_x <- 0.85
legend_y <- 0.9
text(x = legend_x - 2.5, y = legend_x - 0.35, labels = "Families", pos = 1, cex = 0.8, font = 4)PLN_pca_M <- PLNPCA(
formula = data_M$Abundance ~ 1 +
offset(log(Offset)),
data = data_M,
ranks = 1:20
)##
## Initialization...
##
## Adjusting 20 PLN models for PCA analysis.
## Rank approximation = 16 Rank approximation = 10 Rank approximation = 8 Rank approximation = 3 Rank approximation = 6 Rank approximation = 7 Rank approximation = 12 Rank approximation = 2 Rank approximation = 20 Rank approximation = 1 Rank approximation = 5 Rank approximation = 15 Rank approximation = 17 Rank approximation = 18 Rank approximation = 14 Rank approximation = 11 Rank approximation = 9 Rank approximation = 13 Rank approximation = 19 Rank approximation = 4
## Post-treatments
## DONE!
sigma(PLN_pca_M_best) %>%
corrplot::corrplot(is.corr = FALSE, tl.cex = 0.5, col = corrplot::COL2("PiYG"), type = "lower", diag = T)# Define the threshold function
keep_high_correlations <- function(x) {
any(x > 2 | x < -1)
}
########################### Climatic ##################
rows_to_keep <- apply(sigma(PLN_pca_M_best), 1, keep_high_correlations)
cols_to_keep <- apply(sigma(PLN_pca_M_best), 2, keep_high_correlations)
# Subset the matrix to keep only the selected rows and columns
PLN_pca_M_best_filtered <- sigma(PLN_pca_M_best)[rows_to_keep, cols_to_keep]
# Print the head of the filtered matrix to check
# head(oaks_best_pca_filtered)
PLN_pca_M_best_filtered %>%
corrplot::corrplot(is.corr = FALSE, tl.cex = 0.4, col = corrplot::COL2("PiYG"), type = "lower", diag = T)factoextra::fviz_pca_biplot(PLN_pca_M_best,
select.var = list(contrib = 10),
labels = NULL,
geom = "point"
) # CONTRIBUTION TOP 20 CLUSTER.# Define your color palette
colors <- c(
"wild" = "#009392",
"ecotone" = "#9ccb86",
"farmed land" = "#EEB479",
"urban" = "#CF597E"
)
# Plot PCA with ordered legend
fviz_pca_ind(PLN_pca_M_best,
col.ind = data_M$Ecological.gradient,
palette = colors,
geom.ind = "point",
repel = TRUE,
pointsize = 4,
pointshape = 19,
col.var = "black",
arrowsize = 0.6,
labelsize = 4
) +
labs(color = "Ecological gradient") +
scale_color_manual(values = colors) +
guides(color = guide_legend(order = 1))## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
fviz_pca_var(PLN_pca_M_best,
col.var = "contrib",
gradient.cols = c("#00AFBB",
"#E7B800",
"#FC4E07"),
select.var = list(contrib = 20)
)PLN_pca_M_best$var %>%
as.data.frame() %>%
select(contrib.Dim.1, contrib.Dim.2) %>%
arrange(contrib.Dim.1) %>%
tail(n = 10) %>%
knitr::kable()| contrib.Dim.1 | contrib.Dim.2 | |
|---|---|---|
| Geobacillus | 2.610976 | 0.0009065 |
| Streptococcus | 2.658733 | 0.0408484 |
| endosymbionts | 3.067242 | 13.4215891 |
| Enterococcus | 3.069845 | 0.0073316 |
| Solibacillus | 3.111592 | 0.0069872 |
| Fictibacillus | 3.485013 | 26.3546178 |
| Staphylococcus | 3.842718 | 1.6260811 |
| Bacillus | 4.732656 | 1.3751339 |
| Candidatus Baumannia | 7.879148 | 4.6620008 |
| Halomonas | 42.043490 | 10.9909085 |
networks_M <- PLNnetwork(formula = data_M$Abundance ~ 1 +
Ecological.gradient +
offset(log(Offset)), data = data_M)##
## Initialization...
## Adjusting 30 PLN with sparse inverse covariance estimation
## Joint optimization alternating gradient descent and graphical-lasso
## sparsifying penalty = 41.36325 sparsifying penalty = 38.20603 sparsifying penalty = 35.2898 sparsifying penalty = 32.59616 sparsifying penalty = 30.10812 sparsifying penalty = 27.80999 sparsifying penalty = 25.68728 sparsifying penalty = 23.72659 sparsifying penalty = 21.91556 sparsifying penalty = 20.24276 sparsifying penalty = 18.69765 sparsifying penalty = 17.27048 sparsifying penalty = 15.95224 sparsifying penalty = 14.73462 sparsifying penalty = 13.60993 sparsifying penalty = 12.5711 sparsifying penalty = 11.61156 sparsifying penalty = 10.72526 sparsifying penalty = 9.906609 sparsifying penalty = 9.150446 sparsifying penalty = 8.452 sparsifying penalty = 7.806866 sparsifying penalty = 7.210975 sparsifying penalty = 6.660567 sparsifying penalty = 6.152172 sparsifying penalty = 5.682582 sparsifying penalty = 5.248835 sparsifying penalty = 4.848196 sparsifying penalty = 4.478137 sparsifying penalty = 4.136325
## Post-treatments
## DONE!
coefficient_path(networks_M, corr = TRUE) %>%
ggplot(aes(x = Penalty, y = Coeff, group = Edge, colour = Edge)) +
geom_line(show.legend = FALSE) +
coord_trans(x = "log10") +
theme_bw()##
## Stability Selection for PLNnetwork:
## subsampling: ++++++++++++++++++++
networks_M_2 <- getBestModel(networks_M, crit = "StARS", stability = 0.9)
my_graph <- plot(networks_M_2, plot = F)
Isolated <- which(igraph::degree(my_graph) == 0)
my_graph_cleaned <- igraph::delete_vertices(my_graph, Isolated)
# my_graph_cleaned <- delete_vertices(my_graph, c("Francisella_8", "Candidatus Midichloria_11"))
my_graph <- my_graph_cleaned
vertices_names <- V(my_graph)$name
taxo <- tax_table(physeq)
result <- merged_metagenomes@tax_table %>%
as.data.frame() %>%
filter(Genus %in% vertices_names) %>%
select(Genus, Family)
# Extract unique families
unique_families <- unique(result$Family)
# Generate distinct colors for each family
n_colors <- length(unique_families)
colors <- khroma::colour("smooth rainbow")(n_colors)
# Create a named vector to map each family to a color
family_colors <- setNames(colors, unique_families)
# Add the color information to the result dataframe
result <- result %>%
mutate(Color = family_colors[Family])
# extract abundance OTU
otu_table_df <- as.data.frame(otu_table(physeq))
abundance <- otu_table_df %>%
mutate(Abundance = rowSums(.)) %>%
rownames_to_column("vertices_left")
abundance_solo <- abundance %>%
as.data.frame() %>%
filter(vertices_left %in% vertices_names) %>%
select(Abundance)
# # Convert the Abundance column to a numeric vector
# vertex_sizes <- as.numeric(abundance_solo$Abundance)
# # Scale the vertex sizes for better visibility
# vertex_sizes <- (vertex_sizes / max(vertex_sizes)) * 10 # Adjust the multiplier as needed
# Assign colors to the vertices
V(my_graph)$color <- result$Color
# Assign sizes to the vertices
# V(my_graph)$size <- vertex_sizes
# Choose a layout that spreads out the vertices
layout <- igraph::layout_in_circle(my_graph)
# Define unique colors and families for the legend
unique_colors <- unique(result$Color)
unique_families <- unique(result$Family)
# Define edge width for the legend
edge_widths <- E(my_graph)$width
unique_edge_widths <- unique(edge_widths)
# Edge - Arc colors
weight <- as.numeric(E(my_graph)$weight)
color_palette <- colorRampPalette(c("yellow", "red"))(200)
normalized_weights <- (weight - min(weight)) / (max(weight) - min(weight))
edge_colors <- sapply(seq_along(normalized_weights), function(i) {
if (i == 97) {
"#009fff"
} else {
color_palette[as.integer(normalized_weights[i] * (length(color_palette) - 1)) + 1]
}
})
edge_attr(my_graph)$color <- edge_colors
E(my_graph)$color <- edge_colors
plot(simplify(my_graph),
layout = layout,
vertex.size = (V(my_graph)$size) * 2,
vertex.label.dist = 1,
vertex.label.cex = 0.5,
edge.width = E(my_graph)$width,
edge.color = E(my_graph)$color,
margin = 0.2,
main = "Network based on the GLM ~ month",
sub = "Stability threashold 0.90 - Verticles colored by families, Edges colored by weight",
rescale = T
)
legend("left",
legend = unique_families,
col = unique_colors,
pch = 19,
title = "Families",
pt.cex = 2,
cex = 0.5,
bty = "n",
y.intersp = 0.6,
inset = 1 / 10
)edgelist = as_edgelist(my_graph)
order_species <- c("Corynebacterium_3", "Corynebacterium_8", "Fusobacterium_2", "Corynebacterium_3", "Corynebacterium_4", "Francisella_10", "Francisella_9", "Corynebacterium_6", "Mycobacterium_2", "Rhodococcus_1" , "Williamsia_1" , "Williamsia_2", "Nocardioides_1" , "Sphingomonas_1", "Streptococcus_1", "Multi-affiliation_1", "Staphylococcus_3", "Staphylococcus_4","Staphylococcus_6" , "Peptoniphilus_1", "Candidatus Midichloria_2", "Candidatus Midichloria_6", "Candidatus Midichloria_7", "Candidatus Midichloria_3", "Rickettsia_3", "Candidatus Midichloria_4", "Candidatus Midichloria_5", "Rickettsia_1", "Rickettsia_11", "Rickettsia_2", "Rickettsia_6","Rickettsia_10", "Candidatus Midichloria_9","Rickettsia_7", "Rickettsia_7")
result <- result %>% mutate(size = vertex_attr(my_graph)$size)
result_ordered <- result[match(order_species, result$Genus), ]
# Set the background color to black using par and text parameters
par(bg = "black", family = "serif", col.axis = "lightgrey", col.lab = "lightgrey", col.main = "lightgrey", col.sub = "lightgrey")
# plot arc diagram
arcplot(edgelist,
cex.labels = 0.7,
sorted = FALSE,
show.nodes = TRUE,
bg.nodes = result_ordered$Color,
cex.nodes = (result_ordered$size) * 0.8,
pch.nodes = 21,
lwd.nodes = 0.2,
line = 0,
lwd.arcs = 20 * (edge_attr(my_graph)$weight),
col.arcs = edge_attr(my_graph)$color,
col.labels = "lightgrey", # Set font color for labels
col.nodes = "black", # Set font color for node labels
bg = NA # Use NA to keep the par bg color
# above = c(1:97,99:114)
)
legend("topright",
legend = unique_families,
col = unique_colors,
pch = 19,
title = "Families",
pt.cex = 2,
cex = 0.7,
text.font = 3, # Set font to Times New Roman (serif)
text.col = "lightgrey", # Set font color to light grey
bty = "n",
y.intersp = 0.45,
inset = c(-0.6, -0.02))## a 'v_group'-'v_class' should not match multiple colors, update 'color'.
## a 'e_type' should not match multiple colors, update 'color'.
V(my_graph_metaN)$v_group <- result$Family
V(my_graph_metaN)$color <- V(my_graph)$color
V(my_graph_metaN)$size <- vertex_attr(my_graph)$size
V(my_graph_metaN)$v_class <- result$Family
E(my_graph_metaN)$color <- E(my_graph)$color
# Set the background color to black using par and text parameters
par(bg = "white", family = "serif", col.axis = "lightgrey", col.lab = "lightgrey", col.main = "lightgrey", col.sub = "lightgrey")
c_net_plot(my_graph_metaN,
vertex.color = V(my_graph_metaN)$color,
vertex.size = ((V(my_graph_metaN)$size)*1),
vertex.label.color = "black",
vertex.label = V(my_graph_metaN)$name,
vertex.label.cex = 0.9,
edge_width_range = c(0.5, 3), edge.color = "#AA0000", edge.curved = 0.4,
legend = F, legend_number = F,
edge_legend_title = "Weight",
size_legend = F, size_legend_title = "Abundance",
width_legend = T, width_legend_title = "Weight",
main = "Male Network"
)
# vertex_attr(my_graph_metaN)
# vertex_attr(my_graph)
legend("right",
legend = unique_families,
col = unique_colors,
# title = "Families",
pch = 19,
pt.cex = 2,
cex = 0.6,
text.font = 3,
bty = "n",
y.intersp = 0.4,
inset = 8 / 10
)
legend_x <- 0.85
legend_y <- 0.9
text(x = legend_x - 2.5, y = legend_x - 0.35, labels = "Families", pos = 1, cex = 0.8, font = 4)